library(tidyverse) # For data manipulation and visualization
## Warning: package 'tidyverse' was built under R version 4.2.3
## Warning: package 'ggplot2' was built under R version 4.2.3
## Warning: package 'tibble' was built under R version 4.2.3
## Warning: package 'tidyr' was built under R version 4.2.3
## Warning: package 'readr' was built under R version 4.2.3
## Warning: package 'purrr' was built under R version 4.2.3
## Warning: package 'dplyr' was built under R version 4.2.3
## Warning: package 'stringr' was built under R version 4.2.3
## Warning: package 'forcats' was built under R version 4.2.3
## Warning: package 'lubridate' was built under R version 4.2.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readr) # For reading CSV data
library(lubridate) # For date manipulation
library(caret) # For creating training and test sets, and modeling
## Warning: package 'caret' was built under R version 4.2.3
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 4.2.3
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(plotly)
## Warning: package 'plotly' was built under R version 4.2.3
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(ggplot2)
library(readr)
library(RColorBrewer)
library(readxl)
## Warning: package 'readxl' was built under R version 4.2.3
library(scales)
## Warning: package 'scales' was built under R version 4.2.3
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
library(hexbin)
## Warning: package 'hexbin' was built under R version 4.2.3
library(viridis)
## Warning: package 'viridis' was built under R version 4.2.3
## Loading required package: viridisLite
## Warning: package 'viridisLite' was built under R version 4.2.3
##
## Attaching package: 'viridis'
##
## The following object is masked from 'package:scales':
##
## viridis_pal
party_dictionary <- data.frame(
Original = c("ACE","AKI","AIC","AIP","AMP","APF","AE","CIT","CMD","CMP","COM","CNC","CRV","CON","CST","COU","DCG","DNL","DEM","D/C","DFL","DGR","FED","FLP","FRE","GWP","GRT","GRE","GR","HRP","IDP","IND","IAP","ICD","IGR","IP","IDE","IGD","JCN","JUS","LRU","LBR","LFT","LBL","LIB","LBU","MTP","NDP","NLP","NA","NJC","NPP","NPA","NOP","NNE","N","NON","OE","OTH","PG","PSL","PAF","PFP","PFD","POP","PPY","PCH","PPD","PRO","NAP","PRI","RUP","REF","REP","RES","RTL","SEP","SLP","SUS","SOC","SWP","TX","TWR","TEA","THD","LAB","USP","UST","UN","UC","UNI","UNK","VET","WTP","W"),
Label = c("Ace Party","Alaskan Independence Party","American Independent Conservative","American Independent Party","American Party","American People's Freedom Party","Americans Elect","Citizens' Party","Commandments Party","Commonwealth Party of the U.S.","Communist Party","Concerned Citizens Party Of Connecticut","Conservative Party","Constitution Party","Constitutional","Country","D.C. Statehood Green Party","Democratic -Nonpartisan League","Democratic Party","Democratic/Conservative","Democratic-Farmer-Labor","Desert Green Party","Federalist","Freedom Labor Party","Freedom Party","George Wallace Party","Grassroots","Green Party","Green-Rainbow","Human Rights Party","Independence Party","Independent","Independent American Party","Independent Conservative Democratic","Independent Green","Independent Party","Independent Party of Delaware","Industrial Government Party","Jewish/Christian National","Justice Party","La Raza Unida","Labor Party","Less Federal Taxes","Liberal Party","Libertarian Party","Liberty Union Party","Mountain Party","National Democratic Party","Natural Law Party","New Alliance","New Jersey Conservative Party","New Progressive Party","No Party Affiliation","No Party Preference","None","Nonpartisan","Non-Party","One Earth Party","Other","Pacific Green","Party for Socialism and Liberation","Peace And Freedom","Peace And Freedom Party","Peace Freedom Party","People Over Politics","People's Party","Personal Choice Party","Popular Democratic Party","Progressive Party","Prohibition Party","Puerto Rican Independence Party","Raza Unida Party","Reform Party","Republican Party","Resource Party","Right To Life","Socialist Equality Party","Socialist Labor Party","Socialist Party","Socialist Party U.S.A.","Socialist Workers Party","Taxpayers","Taxpayers Without Representation","Tea Party","Theo-Democratic","U.S. Labor Party","U.S. People's Party","U.S. Taxpayers Party","Unaffiliated","United Citizen","United Party","Unknown","Veterans Party","We the People","Write-In")
)
cand_data <- read.csv2(file = "C:/Users/18137/Downloads/candidate_summary_2020.csv", sep = "," , )
cand_data$Total_Contribution <- as.numeric(cand_data$Total_Contribution)
cand_data$Total_Receipt <- as.numeric(cand_data$Total_Receipt)
cand_data <- cand_data %>%
left_join(party_dictionary, by = c("Cand_Party_Affiliation" = "Original")) %>%
# Optionally, you might want to remove the original column and rename the new one
select(-Cand_Party_Affiliation) %>%
rename(Cand_Party_Affiliation = Label)
election_results <- read_excel("C:/Users/18137/Downloads/federalelections2020.xlsx")
## New names:
## • `` -> `...2`
senate_results <- read_excel("C:/Users/18137/Downloads/federalelections2020.xlsx", sheet = "12. US Senate Results by State")
house_results <- read_excel("C:/Users/18137/Downloads/federalelections2020.xlsx", sheet = "13. US House Results by State")
pres_primary_results <- read_excel("C:/Users/18137/Downloads/federalelections2020.xlsx", sheet = "9. 2020 Pres General Results")
senate_results <- senate_results %>%
mutate(Win_Lose = as.numeric('GE Winner Indicator' == "W"))
house_results <- house_results %>%
mutate(Win_Lose = as.numeric('GE Winner Indicator' == "W"))
pres_primary_results <- pres_primary_results %>%
mutate(Win_Lose = as.numeric('Winner_Indicator' == "W"))
cand_data <- cand_data %>% group_by(Cand_Party_Affiliation) %>% filter(Total_Receipt > 0)
all_election_results <- bind_rows(senate_results, house_results, pres_primary_results)
final_data <- cand_data %>%
left_join(all_election_results, by = c("Cand_Id" = "FEC ID"))
final_data$Win_Lose[is.na(final_data$Win_Lose)] <- 0
head(final_data)
merged_data <- final_data %>%
mutate(Total_Contribution = if_else(is.na(Total_Contribution), median(Total_Contribution, na.rm = TRUE), Total_Contribution))
summary(merged_data)
## Link_Image Cand_Name Cand_Id Cand_Office
## Length:2938 Length:2938 Length:2938 Length:2938
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Cand_Office_St Cand_Office_Dist Cand_Incumbent_Challenger_Open_Seat
## Length:2938 Min. : 0.000 Length:2938
## Class :character 1st Qu.: 0.000 Class :character
## Mode :character Median : 4.000 Mode :character
## Mean : 7.642
## 3rd Qu.:10.750
## Max. :53.000
##
## Total_Receipt Total_Disbursement Cash_On_Hand_COP
## Min. :1.000e+00 Length:2938 Length:2938
## 1st Qu.:2.124e+04 Class :character Class :character
## Median :1.587e+05 Mode :character Mode :character
## Mean :3.534e+07
## 3rd Qu.:1.560e+06
## Max. :1.125e+09
##
## Debt_Owed_By_Committee Coverage_End_Date Cand_Street_1
## Length:2938 Length:2938 Length:2938
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## Cand_Street_2 Cand_City Cand_State Cand_Zip
## Length:2938 Length:2938 Length:2938 Min. : 0
## Class :character Class :character Class :character 1st Qu.: 22314
## Mode :character Mode :character Mode :character Median : 48216
## Mean : 26963948
## 3rd Qu.: 82414
## Max. :995100298
## NA's :144
## Individual_Itemized_Contribution Individual_Unitemized_Contribution
## Length:2938 Length:2938
## Class :character Class :character
## Mode :character Mode :character
##
##
##
##
## Individual_Contribution Other_Committee_Contribution
## Length:2938 Length:2938
## Class :character Class :character
## Mode :character Mode :character
##
##
##
##
## Party_Committee_Contribution Cand_Contribution Total_Contribution
## Length:2938 Length:2938 Min. :0.000e+00
## Class :character Class :character 1st Qu.:1.254e+04
## Mode :character Mode :character Median :9.946e+04
## Mean :2.524e+07
## 3rd Qu.:1.226e+06
## Max. :1.090e+09
##
## Transfer_From_Other_Auth_Committee Cand_Loan Other_Loan
## Length:2938 Length:2938 Length:2938
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## Total_Loan Offsets_To_Operating_Expenditure Offsets_To_Fundraising
## Length:2938 Length:2938 Length:2938
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## Offsets_To_Leagal_Accounting Other_Receipts Operating_Expenditure
## Length:2938 Length:2938 Length:2938
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## Exempt_Legal_Accounting_Disbursement Fundraising_Disbursement
## Length:2938 Length:2938
## Class :character Class :character
## Mode :character Mode :character
##
##
##
##
## Transfer_To_Other_Auth_Committee Cand_Loan_Repayment Other_Loan_Repayment
## Length:2938 Length:2938 Length:2938
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## Total_Loan_Repayment Individual_Refund Party_Committee_Refund
## Length:2938 Length:2938 Length:2938
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## Other_Committee_Refund Total_Contribution_Refund Other_Disbursements
## Length:2938 Length:2938 Length:2938
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## Net_Contribution Net_Operating_Expenditure Cash_On_Hand_BOP
## Length:2938 Length:2938 Length:2938
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## Debt_Owe_To_Committee Coverage_Start_Date Cand_Party_Affiliation
## Length:2938 Length:2938 Length:2938
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## 1 STATE ABBREVIATION STATE DISTRICT
## Min. : 3.0 Length:2938 Length:2938 Length:2938
## 1st Qu.: 390.8 Class :character Class :character Class :character
## Median :1102.0 Mode :character Mode :character Mode :character
## Mean :1511.8
## 3rd Qu.:2579.2
## Max. :4036.0
## NA's :574
## (I) Incumbent Indicator CANDIDATE NAME (First) CANDIDATE NAME (Last)
## Length:2938 Length:2938 Length:2938
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## CANDIDATE NAME TOTAL VOTES PARTY PRIMARY VOTES
## Length:2938 Length:2938 Length:2938 Length:2938
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## PRIMARY % RUNOFF VOTES RUNOFF % GENERAL VOTES
## Min. :0.0001 Min. : 1570 Min. :0.2405 Length:2938
## 1st Qu.:0.1105 1st Qu.: 8528 1st Qu.:0.4304 Class :character
## Median :0.3107 Median : 16102 Median :0.5116 Mode :character
## Mean :0.4042 Mean : 48039 Mean :0.5072
## 3rd Qu.:0.6751 3rd Qu.: 33884 3rd Qu.:0.5937
## Max. :1.0000 Max. :502516 Max. :0.7595
## NA's :1344 NA's :2888 NA's :2888
## GENERAL % GE RUNOFF ELECTION VOTES (GA, LA)
## Min. :0.0000 Min. :2195841
## 1st Qu.:0.0135 1st Qu.:2210195
## Median :0.3874 Median :2242451
## Mean :0.3436 Mean :2242464
## 3rd Qu.:0.5628 3rd Qu.:2274721
## Max. :1.0000 Max. :2289113
## NA's :1472 NA's :2934
## GE RUNOFF ELECTION % (GA, LA) COMBINED GE PARTY TOTALS (when applicable)
## Min. :0.4896 Mode:logical
## 1st Qu.:0.4928 NA's:2938
## Median :0.5000
## Mean :0.5000
## 3rd Qu.:0.5072
## Max. :0.5104
## NA's :2934
## COMBINED % (when applicable) PE WINNER INDICATOR GE WINNER INDICATOR
## Mode:logical Length:2938 Length:2938
## NA's:2938 Class :character Class :character
## Mode :character Mode :character
##
##
##
##
## FOOTNOTES Win_Lose GE RUNOFF ELECTION VOTES (GA, GU, LA)
## Length:2938 Min. :0 Min. : 7090
## Class :character 1st Qu.:0 1st Qu.: 9623
## Mode :character Median :0 Median :20296
## Mean :0 Mean :24216
## 3rd Qu.:0 3rd Qu.:34889
## Max. :0 Max. :49183
## NA's :2934
## GE RUNOFF ELECTION % (GA, GU, LA) COMBINED GE PARTY TOTALS (CT, NY)
## Min. :0.3798 Min. : 3164
## 1st Qu.:0.3978 1st Qu.:149449
## Median :0.5000 Median :181021
## Mean :0.5000 Mean :170471
## 3rd Qu.:0.6022 3rd Qu.:206310
## Max. :0.6202 Max. :234933
## NA's :2934 NA's :2836
## COMBINED % (CT, NY) GENERAL ELECTION DATE FIRST NAME
## Min. :0.0094 Min. :2020-11-03 Length:2938
## 1st Qu.:0.4321 1st Qu.:2020-11-03 Class :character
## Median :0.5445 Median :2020-11-03 Mode :character
## Mean :0.5273 Mean :2020-11-03
## 3rd Qu.:0.5929 3rd Qu.:2020-11-03
## Max. :0.9079 Max. :2020-11-03
## NA's :2836 NA's :2516
## LAST NAME LAST NAME, FIRST GENERAL RESULTS TOTAL VOTES #
## Length:2938 Length:2938 Min. : 1 Min. : NA
## Class :character Class :character 1st Qu.: 181 1st Qu.: NA
## Mode :character Mode :character Median : 3404 Median : NA
## Mean : 376663 Mean :NaN
## 3rd Qu.: 80633 3rd Qu.: NA
## Max. :11110639 Max. : NA
## NA's :2518 NA's :2938
## COMBINED GE PARTY TOTALS (NY) COMBINED % (NY) WINNER INDICATOR
## Length:2938 Min. :0.3774 Length:2938
## Class :character 1st Qu.:0.3774 Class :character
## Mode :character Median :0.4930 Mode :character
## Mean :0.4930
## 3rd Qu.:0.6087
## Max. :0.6087
## NA's :2932
## ELECTORAL VOTES
## Length:2938
## Class :character
## Mode :character
##
##
##
##
# Ensure Win_Lose is a factor for logistic regression
merged_data$Win_Lose <- as.factor(merged_data$Win_Lose)
# Fit the model
model <- glm(Win_Lose ~ Total_Contribution, data = merged_data, family = binomial(),control = glm.control(maxit = 10000))
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Check the summary
summary(model)
##
## Call:
## glm(formula = Win_Lose ~ Total_Contribution, family = binomial(),
## data = merged_data, control = glm.control(maxit = 10000))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.107e-08 -2.107e-08 -2.107e-08 -2.107e-08 -2.107e-08
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.057e+01 4.951e+04 -0.001 1
## Total_Contribution -1.963e-22 3.847e-04 0.000 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 0.0000e+00 on 2937 degrees of freedom
## Residual deviance: 1.3047e-12 on 2936 degrees of freedom
## AIC: 4
##
## Number of Fisher Scoring iterations: 29
set.seed(123) # for reproducibility
training_samples <- createDataPartition(merged_data$Win_Lose, p = 0.8, list = FALSE)
train_data <- merged_data[training_samples, ]
test_data <- merged_data[-training_samples, ]
# Predict on test set
predictions <- predict(model, test_data, type = "response")
predicted_class <- if_else(predictions > 0.5, 1, 0)
# Adjusting the sampling method to ensure stratification
training_samples <- createDataPartition(merged_data$Win_Lose, p = 0.8, list = TRUE, times = 1)
train_data <- merged_data[training_samples[[1]], ]
test_data <- merged_data[-training_samples[[1]], ]
test_data$Win_Lose <- factor(test_data$Win_Lose, levels = c("0", "1"))
# Ensure predicted_class is a factor with both levels
predicted_class <- factor(predicted_class, levels = c("0", "1"))
confusionMatrix(predicted_class, test_data$Win_Lose)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 587 0
## 1 0 0
##
## Accuracy : 1
## 95% CI : (0.9937, 1)
## No Information Rate : 1
## P-Value [Acc > NIR] : 1
##
## Kappa : NaN
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1
## Specificity : NA
## Pos Pred Value : NA
## Neg Pred Value : NA
## Prevalence : 1
## Detection Rate : 1
## Detection Prevalence : 1
## Balanced Accuracy : NA
##
## 'Positive' Class : 0
##
# Calculate accuracy or other performance metrics
accuracy <- sum(predicted_class == test_data$Win_Lose) / nrow(test_data)
print(paste("Accuracy:", accuracy))
## [1] "Accuracy: 1"
conTotal <- cand_data$Total_Contribution
fig1 <- cand_data %>%
ggplot(aes(conTotal, Cand_State)) + geom_col(aes(fill = cand_data$Cand_State)) + scale_x_continuous(labels = scales::label_dollar(prefix = "$"))
ggplotly(fig1)
cand_by_affiliation_Total_Receipt <- cand_data %>%
group_by(Cand_Party_Affiliation) %>%
filter(Total_Receipt > 0) %>%
arrange(Total_Receipt)
cand_by_affiliation_Total_Receipt <- data.frame(cand_by_affiliation_Total_Receipt$Cand_Party_Affiliation, cand_by_affiliation_Total_Receipt$Total_Receipt)
arg_data <- aggregate(cand_by_affiliation_Total_Receipt$cand_by_affiliation_Total_Receipt.Total_Receipt,
list(Category = cand_by_affiliation_Total_Receipt$cand_by_affiliation_Total_Receipt.Cand_Party_Affiliation),
sum)
arg_data <- arg_data %>% top_n(2)
## Selecting by x
ggplot(arg_data, aes(x = Category, y = x)) + geom_col(aes(fill = Category)) + scale_y_continuous(labels = scales::label_dollar(prefix = "$", suffix = ""))

# Assuming 'cand_data' is your data frame and it has been loaded into your R session
distinct_parties <- cand_data %>%
select(Cand_Party_Affiliation) %>% # Select the column of interest
distinct() %>% # Get distinct/unique entries
arrange(Cand_Party_Affiliation) # Optional: arrange them alphabetically
# To view the distinct party affiliations
print(distinct_parties)
## # A tibble: 28 × 1
## # Groups: Cand_Party_Affiliation [28]
## Cand_Party_Affiliation
## <chr>
## 1 Conservative Party
## 2 Constitution Party
## 3 Democratic Party
## 4 Democratic-Farmer-Labor
## 5 Green Party
## 6 Human Rights Party
## 7 Independence Party
## 8 Independent
## 9 Independent American Party
## 10 Independent Conservative Democratic
## # ℹ 18 more rows
receipt_data <- cand_data %>%
group_by(Cand_Party_Affiliation) %>%
summarize(Total_Receipt = sum(Total_Receipt, na.rm = TRUE)) %>%
filter(Total_Receipt > 0) %>%
ggplot(aes(x = Cand_Party_Affiliation, y = Total_Receipt, fill = Cand_Party_Affiliation)) +
geom_bar(stat = "identity") +
scale_fill_viridis(discrete = TRUE, option = "M") +
scale_y_log10(labels = label_dollar(scale = 1)) +
labs(x = "Party Affiliation",
y = "Log of Total Receipts (Dollars)",
title = "Total Receipts by Party Affiliation") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
plotly_receipt_data <- ggplotly(receipt_data)
## Warning in viridisLite::viridis(n, alpha, begin, end, direction, option):
## Option 'M' does not exist. Defaulting to 'viridis'.
plotly_receipt_data
Congressional_data <- cand_data %>%
group_by(Cand_Office) %>%
filter(Cand_Office != "P") %>%
arrange(Total_Contribution)
Congressional_data <- aggregate(Congressional_data$Total_Contribution,
list(Category = Congressional_data$Cand_Office ),
sum)
president_data <- cand_data %>%
group_by(Cand_Office) %>%
filter(Cand_Office == "P") %>%
arrange(Total_Contribution)
Congressional_data %>% ggplot(aes(x = x, y = Category)) + geom_col(aes(fill = Category)) + scale_x_continuous(label = scales::label_dollar(prefix = "$"))

fig1 <- cand_data %>%
group_by(Cand_State) %>%
summarise(Total_Contribution = sum(Total_Contribution)) %>%
filter(Total_Contribution > 0) %>% # Filtering out zeros
ggplot(aes(x = Cand_State, y = Total_Contribution, fill = Cand_State)) +
geom_bar(stat = "identity") +
scale_fill_viridis(discrete = TRUE, option = "D") + # Using viridis for discrete color mapping
scale_y_log10(labels = scales::label_dollar(scale = 1)) + # Formatting y-axis as dollar values
theme_minimal() +
labs(title = "Total Contributions by State",
y = "Log of Total Contributions (Dollars)",
x = "State") +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) # Vertical x-axis labels
# Convert to Plotly interactive plot
plotly_fig1 <- ggplotly(fig1)
plotly_fig1